# coding=utf-8
from locale import *
import re
import sys
import datetime
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup


def get_mac_address():
    """Return this machine's MAC address as a 12-character lowercase hex string."""
    import uuid
    # uuid.getnode() yields the 48-bit hardware address as an integer;
    # zero-padded 12-digit hex formatting is equivalent to taking the last
    # 12 hex digits of UUID(int=node).hex.
    return format(uuid.getnode(), '012x')


def validate_mac_address():
    """Fetch the list of authorized MAC addresses from the license server.

    Returns:
        str: the raw response body decoded with the default (UTF-8) codec.
    """
    import urllib.request
    url = 'http://amazon-ceping.xunhuanle.com/publicwelcome/getallmacaddress'
    # BUGFIX: the response object was never closed, leaking the connection.
    # The with-statement closes it deterministically.
    with urllib.request.urlopen(url) as response:
        return response.read().decode()

def set_driver():
    """Build a Firefox WebDriver configured for fast, image-free crawling.

    Image loading is disabled (permissions.default.image = 2) and the Flash
    plugin is turned off to speed up page loads during list scraping.
    """
    profile = FirefoxProfile()
    profile.set_preference('permissions.default.image', 2)
    profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
    return webdriver.Firefox(profile)

def run_crawler(url, run_times, input_ping_lun_num, input_price_begin, input_price_end, input_sort_end, open_file):
    """Crawl Amazon search-result pages and append matching products to a file.

    A product is recorded when its review count is <= input_ping_lun_num, its
    whole-dollar price is strictly inside (input_price_begin, input_price_end),
    and its sales-rank number is below input_sort_end.

    Args:
        url: search-results URL to start crawling from.
        run_times: number of page loads per browser session; the browser is
            restarted once this many loads have been made.
        input_ping_lun_num: maximum acceptable review ("ping lun") count.
        input_price_begin: exclusive lower price bound.
        input_price_end: exclusive upper price bound.
        input_sort_end: exclusive upper bound on the sales-rank number.
        open_file: path of the tab-separated output file (opened in append mode).
    """
    driver = set_driver()
    driver.get(url)
    i = 1
    while i < run_times:
        i += 1
        current_list_url = driver.current_url
        print('CURRENT LIST PAGE URL:' + current_list_url)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        allItems = soup.select('li.s-result-item')
        try:
            # Look for the next-page button. If it is missing, decide below
            # whether we reached the last page or hit a captcha page.
            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, 'pagnNextLink')))
            nextPageUrl = driver.find_element_by_id("pagnNextLink").get_attribute("href")
        except Exception:
            try:
                WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, 'proceedWarning')))
                print('This is the last page,We go to next type of link!')
                driver.quit()
                break
            except Exception:
                print("can't find the last page tips!!! We try to find captchacharacters ID")
                # Check for a captcha page; if found, restart the browser WITH
                # images enabled so a human can read and solve the captcha.
                try:
                    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, 'captchacharacters')))
                    valid_url = driver.current_url
                    print("VALID_URL=", valid_url)
                    driver.quit()
                    firefox_profile = FirefoxProfile()
                    firefox_profile.set_preference('permissions.default.image', 1)
                    firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'true')
                    driver = webdriver.Firefox(firefox_profile)
                    driver.get(valid_url)
                    # BUGFIX: restart the loop so the page is re-parsed after
                    # the captcha is solved; previously execution fell through
                    # and crashed on the still-unbound nextPageUrl.
                    continue
                except Exception:
                    print("Unknow Error!!!")
                    driver.quit()
                    break
        for item in allItems:
            span_name = item.select_one('span[name]')
            if span_name is None:
                continue
            asin_text = span_name.get("name")
            star_ele = span_name.select_one('i.a-icon-star > span')
            star_text = star_ele.get_text() if star_ele is not None else "Null"
            pingLun_ele = span_name.select_one(' + a')
            if pingLun_ele is None:
                continue
            pingLun_text = pingLun_ele.get_text()
            # atof() honours the LC_NUMERIC locale, so "1,234" parses correctly.
            pingLunNum = int(atof(pingLun_text))
            # Build the absolute product URL.
            m_url = item.select_one('h2').parent.get("href")
            if not m_url.startswith('http'):
                m_url = 'https://www.amazon.com' + m_url
            print('HANDLE PRODUCT URL:', m_url)
            # Filter 1: review count must not exceed the configured maximum.
            if pingLunNum > input_ping_lun_num:
                print('PINGLUN NUM:' + pingLun_text + ' Comments number is too high to need!')
                continue
            price_ele = item.select_one('span.sx-price-whole')
            if price_ele is None:
                continue
            price_get = int(price_ele.get_text())
            # Filter 2: price must fall strictly inside the configured range.
            if not (input_price_begin < price_get < input_price_end):
                print('THE PRICE {} NOT IN RANGE[{},{}]'.format(price_get, input_price_begin, input_price_end))
                continue
            i += 1
            if i >= run_times:
                # Session budget exhausted: restart the browser and reset.
                i = 1
                driver.quit()
                driver = set_driver()
            driver.get(m_url.strip())
            soup_validate = BeautifulSoup(driver.page_source, "html.parser")
            rank_ele = soup_validate.select_one('#SalesRank')
            if rank_ele is None:
                # BUGFIX: product pages without a SalesRank section used to
                # raise AttributeError on .get_text(); skip them instead.
                continue
            sales_rank = rank_ele.get_text().strip()
            print('SALES_RANK:', sales_rank)
            # NOTE(review): this pattern is a character class (digits plus the
            # literals + , ? *), not a grouped-number regex — kept as-is to
            # preserve the original matching behavior.
            rank_tuple = re.findall(r'[\d+\,?\d*]+', sales_rank)
            if len(rank_tuple) == 0:
                continue
            rank_num = atof(rank_tuple[0])
            rank_text = rank_tuple[0]
            print("RANK NUM:", rank_num)
            # Filter 3: sales rank must beat the configured cutoff.
            if int(rank_num) < input_sort_end:
                print('NEED URL {} WRITE TO FILE'.format(m_url))
                with open(open_file, 'a', encoding='utf-8') as out:
                    out.write(m_url + '\t' + asin_text + '\t' + star_text + '\t' + pingLun_text + '\t' + rank_text + '\t' + sales_rank + '\n')
            else:
                print('RANK NUM {} IS GREATER THAN {}, NOT RECORD!!!'.format(rank_num, input_sort_end))
        if i < run_times:
            driver.get(nextPageUrl)
        else:
            driver.quit()
            # BUGFIX: the recursive restart previously dropped the run_times
            # argument, shifting every remaining parameter one position left.
            run_crawler(nextPageUrl, run_times, input_ping_lun_num, input_price_begin, input_price_end, input_sort_end, open_file)







# BUGFIX: guard the interactive script body so that merely importing this
# module no longer launches the crawler and prompts for input.
if __name__ == "__main__":
    # License check: this machine's MAC must appear in the server's allowlist.
    mac_address = get_mac_address()
    print("Your macaddress is below:")
    print(mac_address)
    validation_content = validate_mac_address()
    if mac_address not in validation_content:
        print("Please submit your unicode '"+mac_address+"' to administrator!!!")
        sys.exit()

    # Output file named with the current timestamp.
    file_name = "found_product_"+datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S')+".xls"

    # Search-results URL to crawl.
    search_url = input("Input Search URL:")
    print("Search url is:", search_url)

    # Skip products whose review count exceeds this upper bound.
    inputPingLunNum = input("Comments Number Highest:").strip()
    print("Comments Number Highest is:", inputPingLunNum)

    # Price lower and upper bounds.
    inputPriceBegin = input("Price Begin:").strip()
    inputPriceEnd = input("Price End:").strip()
    print("Price Begin With:{},End With:{}".format(inputPriceBegin, inputPriceEnd))

    # Sales-rank cutoff: only products ranked below this are recorded.
    inputSortEnd = input("Ranking Highest:").strip()
    print("Ranking End With:", inputSortEnd)

    # Number of page loads before the browser is restarted.
    runTimes = input("Run Times Every Cycle:").strip()
    print("Run Times Every Cycle is:", runTimes)

    # 'English_US' is the Windows locale name; it lets atof() parse
    # comma-grouped numbers like "1,234". NOTE(review): on POSIX systems this
    # name raises locale.Error — use e.g. 'en_US.UTF-8' there.
    setlocale(LC_NUMERIC, 'English_US')

    run_crawler(search_url, int(runTimes), int(inputPingLunNum), int(inputPriceBegin), int(inputPriceEnd), int(inputSortEnd), file_name)






